library(tidyverse)
# Load the TMDB movies data set from the local archive folder.
movies <- read.csv(file = "archive/tmdb_5000_movies.csv")

# Quick look at the first rows and at the available column names.
movies %>%
  head()
movies %>%
  names()
1
First thoughts - homepage - missing values - revenue - 0 value - few factor types
2
# Keep only the three columns used in the rest of the analysis.
movies_selected <- select(movies, title, runtime, budget)

# Display the reduced data frame.
movies_selected
3
# Count the number of missing values in each column.
# `.cols` is given explicitly as everything(): calling across() without
# `.cols` is deprecated since dplyr 1.1.0.
movies_selected %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
# Same missing-value count, written out one column at a time.
summarise(
  movies_selected,
  na_runtime = sum(is.na(runtime)),
  na_title = sum(is.na(title)),
  na_budget = sum(is.na(budget))
)
4
# Treat a runtime of 0 as missing by converting it to NA.
movies_runtime <- mutate(movies_selected, runtime = na_if(runtime, 0))

# Count the missing values now present in the runtime column.
movies_runtime %>%
  summarise(
    count_missing = sum(is.na(runtime))
  )
5
# Fill the missing runtimes with the median of the observed runtimes.
movies_imputed <- movies_runtime %>%
  mutate(
    runtime = coalesce(
      runtime,
      median(runtime, na.rm = TRUE)
    )
  )

# Confirm how many missing runtimes remain after imputation.
movies_imputed %>%
  summarise(
    count_missing = sum(is.na(runtime))
  )
6
# Show the movies with the 10 shortest runtimes (ties are kept).
slice_min(movies_imputed, order_by = runtime, n = 10)
7
# Overwrites movies_imputed so that implausibly small budgets (below 100)
# are replaced with the median budget. The median is taken over all
# budgets, including the small values being replaced.
# - na.rm = TRUE: without it a single missing budget would make the median
#   NA and turn every imputed value into NA (matches the runtime imputation).
# - as.numeric(): keeps both if_else() branches the same type regardless of
#   whether median() returns an integer or a double.
movies_imputed <- movies_imputed %>%
  mutate(
    budget = as.numeric(budget),
    budget = if_else(budget < 100, median(budget, na.rm = TRUE), budget)
  )
Extension 1
# Classify each movie by the size of its budget.
# case_when() uses the first condition that matches, so the second branch
# only applies to budgets of at least 12 million; no explicit lower bound
# is needed. Rows matching no condition (e.g. a missing budget) get NA.
movie_budgets <- movies_imputed %>%
  mutate(
    budget_type = case_when(
      budget < 12e6 ~ "Small budget",
      budget < 40e6 ~ "Medium budget", # between 12 and 40 million
      budget >= 40e6 ~ "Big budget"
    )
  )

# Display the classified data frame.
movie_budgets
2
# Number of missing values across all columns of type numeric.
movies %>%
  summarise(
    across(where(is.numeric), function(column) sum(is.na(column)))
  )

# Number of missing values across all columns of type character.
movies %>%
  summarise(
    across(where(is.character), function(column) sum(is.na(column)))
  )
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQoKbW92aWVzIDwtIHJlYWQuY3N2KCJhcmNoaXZlL3RtZGJfNTAwMF9tb3ZpZXMuY3N2IikKYGBgCmBgYHtyfQpoZWFkKG1vdmllcykKCm5hbWVzKG1vdmllcykKYGBgCgojIDEKRmlyc3QgdGhvdWdodHMKLSBob21lcGFnZSAtIG1pc3NpbmcgdmFsdWVzCi0gcmV2ZW51ZSAtIDAgdmFsdWUKLSBmZXcgZmFjdG9yIHR5cGVzCgojIDIKCmBgYHtyfQptb3ZpZXNfc2VsZWN0ZWQgPC1tb3ZpZXMgJT4lIAogIHNlbGVjdCh0aXRsZSxydW50aW1lLCBidWRnZXQpCm1vdmllc19zZWxlY3RlZApgYGAKCiMgMwoKYGBge3J9CiMgY291bnQgdGhlIG51bWJlciBvZiBtaXNzaW5nIHZhbHVlcyBpbiBlYWNoIGNvbHVtbgojIGZ1bmN0aW9uYWwgcHJvZ3JhbW1pbmcKbW92aWVzX3NlbGVjdGVkICU+JSAKICBzdW1tYXJpc2UoYWNyb3NzKC5mbnMgPSB+c3VtKGlzLm5hKC54KSkpKQpgYGAKCmBgYHtyfQptb3ZpZXNfc2VsZWN0ZWQgJT4lIAogIHN1bW1hcmlzZShuYV9ydW50aW1lID0gc3VtKGlzLm5hKHJ1bnRpbWUpKSwKICAgICAgICAgICAgbmFfdGl0bGUgPSBzdW0oaXMubmEodGl0bGUpKSwKICAgICAgICAgICAgbmFfYnVkZ2V0ID0gc3VtKGlzLm5hKGJ1ZGdldCkpKQpgYGAKCgojIDQKCmBgYHtyfQojIGNvbnZlcnRzIHJ1bnRpbWVzIDAgaW50byBOQXMKCm1vdmllc19ydW50aW1lIDwtIG1vdmllc19zZWxlY3RlZCAlPiUKICBtdXRhdGUocnVudGltZSA9IG5hX2lmKHJ1bnRpbWUsIDApKQoKIyBjb3VudHMgbWlzc2luZyB2YWx1ZXMgaW4gY29sdW1uIHJ1bnRpbWUKCm1vdmllc19ydW50aW1lICU+JSAKICBzdW1tYXJpc2UoY291bnRfbWlzc2luZyA9IHN1bShpcy5uYShydW50aW1lKSkpCmBgYAoKIyA1CgpgYGB7cn0KIyByZXByZXNlbnQgdGhlIG1pc3NpbmcgdmFsdWVzIGluIHJ1bnRpbWUgd2l0aCB0aGUgbWVkaWFuIHJ1bnRpbWUgCgptb3ZpZXNfaW1wdXRlZCA8LSBtb3ZpZXNfcnVudGltZSAlPiUgCiAgbXV0YXRlKHJ1bnRpbWUgPSBjb2FsZXNjZShydW50aW1lLCBtZWRpYW4ocnVudGltZSwgbmEucm0gPSBUUlVFKSkpCgpgYGAKYGBge3J9Cm1vdmllc19pbXB1dGVkICU+JSAKICBzdW1tYXJpc2UoY291bnRfbWlzc2luZyA9IHN1bShpcy5uYShydW50aW1lKSkpCmBgYAoKIyA2IAoKYGBge3J9CiMgZ2V0cyB0aGUgbW92aWVzIHdpdGggdGhlIGxvbmdlc3QgcnVudGltZXMKCm1vdmllc19pbXB1dGVkICU+JSAKICBzbGljZV9tYXgocnVudGltZSwgbiA9IDEwKQpgYGAKYGBge3J9CiMgIyBnZXRzIHRoZSBtb3ZpZXMgd2l0aCB0aGUgc2hvcnRlc3QgcnVudGltZXMKCm1vdmllc19pbXB1dGVkICU+JSAKICBzbGljZV9taW4ocnVudGltZSwgbiA9IDEwKQpgYGAKCiMgNwoKYGBge3J9CiMgT3ZlcndyaXRlcyBtb3ZpZXNfaW1wdXRlZCB0byBjb250YWluIG5ldyBidWRnZXQgdmFsdWVzCgptb3ZpZXNfaW1wdXRlZCA8LSBt
b3ZpZXNfaW1wdXRlZCAlPiUKICBtdXRhdGUoYnVkZ2V0ID0gaWZfZWxzZShidWRnZXQgPCAxMDAsIG1lZGlhbihidWRnZXQpLCBidWRnZXQpKQpgYGAKCgojIEV4dGVuc2lvbiAxCgpgYGB7cn0KbW92aWVfYnVkZ2V0cyA8LSBtb3ZpZXNfaW1wdXRlZCAlPiUgCiAgbXV0YXRlKGJ1ZGdldF90eXBlID0gY2FzZV93aGVuKGJ1ZGdldCA8IDEyZTYgfiAiU21hbGwgYnVkZ2V0IiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgYnVkZ2V0IDwgNDBlNiB+ICJNZWl1bSBidWRnZXQiLCAjQnVkZ2V0cyBiZXR3ZWVuIDEyIGFuZCA0MCBtaWxsaW9uIGFzIOKAmE1lZGl1bSBidWRnZXTigJkKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgI2J1ZGdldCA+IDEyZTYgJiBidWRnZXQgPCA0MGU2IH4gIk1lZGl1bSBidWRnZXQiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBidWRnZXQgPj0gNDBlNiB+ICJCaWcgYnVkZ2V0IikpCm1vdmllX2J1ZGdldHMKYGBgCiMgMgpgYGB7cn0KIyBzdW1tYXJpc2VzIHRoZSBudW1iZXIgb2YgbWlzc2luZyB2YWx1ZXMgY3Jvc3MgYWxsIGNvbHVtbnMgb2YgdHlwZSBudW1lcmljCgptb3ZpZXMgJT4lCiAgc3VtbWFyaXNlKGFjcm9zcyh3aGVyZShpcy5udW1lcmljKSwgfnN1bShpcy5uYSgueCkpKSkKYGBgCgoKYGBge3J9CiMgc3VtbWFyaXNlcyB0aGUgbnVtYmVyIG9mIG1pc3NpbmcgdmFsdWVzIGNyb3NzIGFsbCBjb2x1bW5zIG9mIHR5cGUgY2hhcmFjdGVyCgptb3ZpZXMgJT4lCiAgc3VtbWFyaXNlKGFjcm9zcyh3aGVyZShpcy5jaGFyYWN0ZXIpLCB+c3VtKGlzLm5hKC54KSkpKQ==